pip install openpyxl
Requirement already satisfied: openpyxl in c:\users\sajja\anaconda3\envs\dab200\lib\site-packages (3.1.2) Requirement already satisfied: et-xmlfile in c:\users\sajja\anaconda3\envs\dab200\lib\site-packages (from openpyxl) (1.1.0) Note: you may need to restart the kernel to use updated packages.
pip install plotly
Collecting plotly
Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
---------------------------------------- 0.0/15.6 MB ? eta -:--:--
---------------------------------------- 0.0/15.6 MB ? eta -:--:--
--------------------------------------- 0.0/15.6 MB 487.6 kB/s eta 0:00:32
--------------------------------------- 0.4/15.6 MB 3.3 MB/s eta 0:00:05
-- ------------------------------------- 0.9/15.6 MB 5.6 MB/s eta 0:00:03
---- ----------------------------------- 1.6/15.6 MB 7.9 MB/s eta 0:00:02
------ --------------------------------- 2.4/15.6 MB 9.6 MB/s eta 0:00:02
-------- ------------------------------- 3.4/15.6 MB 11.3 MB/s eta 0:00:02
----------- ---------------------------- 4.3/15.6 MB 12.6 MB/s eta 0:00:01
------------- -------------------------- 5.4/15.6 MB 13.9 MB/s eta 0:00:01
---------------- ----------------------- 6.6/15.6 MB 15.6 MB/s eta 0:00:01
-------------------- ------------------- 7.9/15.6 MB 16.2 MB/s eta 0:00:01
----------------------- ---------------- 9.2/15.6 MB 17.2 MB/s eta 0:00:01
-------------------------- ------------ 10.5/15.6 MB 21.8 MB/s eta 0:00:01
----------------------------- --------- 11.8/15.6 MB 25.1 MB/s eta 0:00:01
-------------------------------- ------ 13.2/15.6 MB 27.3 MB/s eta 0:00:01
------------------------------------ -- 14.5/15.6 MB 28.5 MB/s eta 0:00:01
-------------------------------------- 15.6/15.6 MB 28.5 MB/s eta 0:00:01
-------------------------------------- 15.6/15.6 MB 28.5 MB/s eta 0:00:01
--------------------------------------- 15.6/15.6 MB 24.2 MB/s eta 0:00:00
Requirement already satisfied: packaging in c:\users\sajja\anaconda3\envs\dab200\lib\site-packages (from plotly) (23.0)
Collecting tenacity>=6.2.0
Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.18.0 tenacity-8.2.3
Note: you may need to restart the kernel to use updated packages.
# Import the Pandas library
import pandas as pd
# Load your dataset
df = pd.read_csv('MonthlyRetailTradeE-commerceSales(x1,000)_012017To082023.csv')
print(df.head())
REF_DATE GEO DGUID \
0 2017-01 Canada 2016A000011124
1 2017-01 Canada 2016A000011124
2 2017-01 Canada 2016A000011124
3 2017-01 Canada 2016A000011124
4 2017-01 Canada 2016A000011124
North American Industry Classification System (NAICS) \
0 Retail trade [44-45]
1 Retail trade [44-45]
2 Retail trade [44-45]
3 Retail trade [44-45]
4 Motor vehicle and parts dealers [441]
Sales Adjustments UOM UOM_ID \
0 Total retail sales Unadjusted Dollars 81
1 Total retail sales Seasonally adjusted Dollars 81
2 Retail e-commerce sales Unadjusted Dollars 81
3 Retail e-commerce sales Seasonally adjusted Dollars 81
4 Total retail sales Unadjusted Dollars 81
SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL \
0 thousands 3 v1446859481 1.1.1.1 41377009.0 NaN NaN
1 thousands 3 v1446859483 1.1.1.2 50417235.0 NaN NaN
2 thousands 3 v1446859482 1.1.2.1 1110045.0 NaN NaN
3 thousands 3 v1446859484 1.1.2.2 1236885.0 NaN NaN
4 thousands 3 v1446859485 1.2.1.1 10162441.0 NaN NaN
TERMINATED DECIMALS
0 NaN 0
1 NaN 0
2 NaN 0
3 NaN 0
4 NaN 0
import pandas as pd
import numpy as np
# Load your dataset
df = pd.read_csv('MonthlyRetailTradeE-commerceSales(x1,000)_012017To082023.csv')
# Check for missing values
missing_values = df.isnull().sum()
missing_values
REF_DATE 0 GEO 0 DGUID 0 North American Industry Classification System (NAICS) 0 Sales 0 Adjustments 0 UOM 0 UOM_ID 0 SCALAR_FACTOR 0 SCALAR_ID 0 VECTOR 0 COORDINATE 0 VALUE 7735 STATUS 40341 SYMBOL 54342 TERMINATED 54342 DECIMALS 0 dtype: int64
import pandas as pd
# Remove rows with missing 'VALUE' entries
df_cleaned = df.dropna(subset=['VALUE'])
# Verify the shape of the cleaned dataset
print("Shape of cleaned dataset:", df_cleaned.shape)
Shape of cleaned dataset: (46607, 17)
# Assuming your cleaned DataFrame is named 'df_cleaned' and you want to save it as a CSV file
#df_cleaned.to_csv('MonthlyRetailTradeE-commerceSales(x1,000)_012017To082023_cleaned_dataset.csv', index=False)
df_cleaned.to_excel('MonthlyRetailTradeE-commerceSales_012017To082023_cleaned_dataset.xlsx', index=False)
df_cleaned.head()
| REF_DATE | GEO | DGUID | North American Industry Classification System (NAICS) | Sales | Adjustments | UOM | UOM_ID | SCALAR_FACTOR | SCALAR_ID | VECTOR | COORDINATE | VALUE | STATUS | SYMBOL | TERMINATED | DECIMALS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01 | Canada | 2016A000011124 | Retail trade [44-45] | Total retail sales | Unadjusted | Dollars | 81 | thousands | 3 | v1446859481 | 1.1.1.1 | 41377009.0 | NaN | NaN | NaN | 0 |
| 1 | 2017-01 | Canada | 2016A000011124 | Retail trade [44-45] | Total retail sales | Seasonally adjusted | Dollars | 81 | thousands | 3 | v1446859483 | 1.1.1.2 | 50417235.0 | NaN | NaN | NaN | 0 |
| 2 | 2017-01 | Canada | 2016A000011124 | Retail trade [44-45] | Retail e-commerce sales | Unadjusted | Dollars | 81 | thousands | 3 | v1446859482 | 1.1.2.1 | 1110045.0 | NaN | NaN | NaN | 0 |
| 3 | 2017-01 | Canada | 2016A000011124 | Retail trade [44-45] | Retail e-commerce sales | Seasonally adjusted | Dollars | 81 | thousands | 3 | v1446859484 | 1.1.2.2 | 1236885.0 | NaN | NaN | NaN | 0 |
| 4 | 2017-01 | Canada | 2016A000011124 | Motor vehicle and parts dealers [441] | Total retail sales | Unadjusted | Dollars | 81 | thousands | 3 | v1446859485 | 1.2.1.1 | 10162441.0 | NaN | NaN | NaN | 0 |
# Get the summary statistics
print(df_cleaned.describe())
UOM_ID SCALAR_ID VALUE SYMBOL TERMINATED DECIMALS count 46607.0 46607.0 4.660700e+04 0.0 0.0 46607.0 mean 81.0 3.0 1.171890e+06 NaN NaN 0.0 std 0.0 0.0 4.122422e+06 NaN NaN 0.0 min 81.0 3.0 1.000000e+00 NaN NaN 0.0 25% 81.0 3.0 3.806250e+04 NaN NaN 0.0 50% 81.0 3.0 1.677500e+05 NaN NaN 0.0 75% 81.0 3.0 6.889420e+05 NaN NaN 0.0 max 81.0 3.0 7.288349e+07 NaN NaN 0.0
import pandas as pd
import matplotlib.pyplot as plt
# Display basic statistics of numerical columns
numerical_summary = df_cleaned.describe()
numerical_summary
# Create a histogram for the 'VALUE' column
plt.figure(figsize=(8, 6))
n, bins, patches = plt.hist(df_cleaned['VALUE'], bins=20, color='skyblue')
plt.title('Distribution of VALUE')
plt.xlabel('VALUE')
plt.ylabel('Frequency')
# Initialize arrays to store values and coordinates
#value_array = []
#coordinates_array = []
# Annotate each bar with its frequency and coordinates (rotated 90 degrees)
for rect in patches:
height = rect.get_height()
x_coord = rect.get_x() + rect.get_width() / 2
y_coord = height
plt.annotate(f'({x_coord:.2f}, {y_coord:.2f})', xy=(x_coord, y_coord + 50), ha='center', va='bottom', rotation=90)
# Append values and coordinates to respective arrays
#value_array.append(height)
# coordinates_array.append((x_coord, y_coord))
plt.show()
# Print the arrays
#print("Value Array:", value_array)
#print("Coordinates Array:", coordinates_array)
# Create a bar plot for the 'GEO' column with clear count labels
geo_counts = df_cleaned['GEO'].value_counts()
plt.figure(figsize=(10, 6))
ax = geo_counts.plot(kind='bar', color='lightcoral')
plt.title('Distribution of GEO')
plt.xlabel('GEO')
plt.ylabel('Count')
# Annotate the counts above the bars
for i, count in enumerate(geo_counts):
ax.text(i, count + 20, str(count), ha='center', va='bottom', fontsize=9)
plt.xticks(rotation=90)
plt.tight_layout() # Adjust plot layout for better visibility
plt.show()
import plotly.express as px
# Define the number of date labels to display (e.g., 10 labels)
num_labels = 25
# Calculate the step size to evenly select date labels
step = max(1, len(df['REF_DATE']) // num_labels)
# Create a subset of dates to display
selected_dates = df['REF_DATE'][::step]
# Create an interactive line plot using Plotly Express
fig = px.line(df, x='REF_DATE', y='VALUE', title='Sales Trends Over Time', labels={'REF_DATE': 'Date', 'VALUE': 'Sales (in Thousand Dollars)'})
fig.update_xaxes(tickvals=selected_dates, tickangle=45)
fig.show()
# Count unique values in categorical columns
categorical_columns = ['GEO', 'North American Industry Classification System (NAICS)', 'Adjustments']
for col in categorical_columns:
unique_values = df_cleaned[col].value_counts()
print(f'Unique values in {col}:')
print(unique_values)
print()
Unique values in GEO: Canada 4918 Ontario 2459 Alberta 2459 Montréal, Quebec 2457 British Columbia 2453 Quebec 2452 Toronto, Ontario 2425 Manitoba 2408 Saskatchewan 2399 Vancouver, British Columbia 2380 New Brunswick 2290 Nova Scotia 2281 Newfoundland and Labrador 1999 Prince Edward Island 1702 Calgary, Alberta 1658 Quebec, Quebec 1579 Winnipeg, Manitoba 1574 Gatineau, Quebec 1566 Edmonton, Alberta 1541 Ottawa, Ontario 1511 Northwest Territories 805 Yukon 800 Nunavut 491 Name: GEO, dtype: int64 Unique values in North American Industry Classification System (NAICS): Retail trade [44-45] 3216 Motor vehicle and parts dealers [441] 1724 Health and personal care retailers [456] 1662 Food and beverage retailers [445] 1656 Supermarkets and other grocery retailers (except convenience retailers) [44511] 1656 Gasoline stations and fuel vendors [457] 1654 Sporting goods, hobby, musical instrument, book, and miscellaneous retailers [459] 1623 General merchandise retailers [455] 1616 Electronics and appliances retailers [4492] 1591 Furniture, home furnishings, electronics and appliances retailers [449] 1589 Miscellaneous retailers [459B] 1584 Automobile dealers [4411] 1575 Grocery and convenience retailers [4451] 1556 Clothing, clothing accessories, shoes, jewelry, luggage and leather goods retailers [458] 1555 Convenience retailers and vending machine operators [44513] 1548 Furniture, floor covering, window treatment and other home furnishings retailers [4491] 1547 Automotive parts, accessories and tire retailers [4413] 1547 Other motor vehicle dealers [4412] 1521 Building material and garden equipment and supplies dealers [444] 1488 Furniture retailers [44911] 1476 New car dealers [44111] 1476 Sporting goods, hobby, musical instrument, book retailers and news dealers [459A] 1446 Clothing and clothing accessories retailers [4581] 1427 Floor covering, window treatment and other home furnishing retailers [44912] 1416 Cannabis retailers [459993] 1273 Beer, wine and liquor retailers [4453] 1271 Used car dealers [44112] 1263 Shoe retailers [4582] 1258 Jewellery, luggage and leather goods retailers [4583] 1237 Specialty food retailers [4452] 1156 Name: North American Industry Classification System (NAICS), dtype: int64 Unique values in Adjustments: Unadjusted 42868 Seasonally adjusted 3739 Name: Adjustments, dtype: int64
import pandas as pd
# Load the dataset from a CSV file
df = df_cleaned
# Initialize an empty DataFrame to store the comparison results
comparison_df = pd.DataFrame(columns=["REF_DATE", "GEO", "North American Industry Classification System (NAICS)", "Difference"])
# Get unique categories for which you want to compare values
categories = df["North American Industry Classification System (NAICS)"].unique()
# Loop through categories and compare "Unadjusted" and "Seasonally adjusted" values
for category in categories:
category_data = df[df["North American Industry Classification System (NAICS)"] == category]
unadjusted = category_data[category_data["Adjustments"] == "Unadjusted"]
seasonally_adjusted = category_data[category_data["Adjustments"] == "Seasonally adjusted"]
# Check if both "Unadjusted" and "Seasonally adjusted" data are available
if len(unadjusted) > 0 and len(seasonally_adjusted) > 0:
# Ensure the DataFrames have the same length before subtraction
min_length = min(len(unadjusted), len(seasonally_adjusted))
diff = seasonally_adjusted["VALUE"].values[:min_length] - unadjusted["VALUE"].values[:min_length]
comparison_df = comparison_df.append({
"REF_DATE": unadjusted["REF_DATE"].values[0],
"GEO": unadjusted["GEO"].values[0],
"North American Industry Classification System (NAICS)": category,
"Difference": diff,
}, ignore_index=True)
# Print the comparison results
print(comparison_df)
REF_DATE GEO North American Industry Classification System (NAICS) \
0 2017-01 Canada Retail trade [44-45]
1 2017-01 Canada Motor vehicle and parts dealers [441]
2 2017-01 Canada Automobile dealers [4411]
3 2017-01 Canada New car dealers [44111]
4 2017-01 Canada Used car dealers [44112]
5 2017-01 Canada Other motor vehicle dealers [4412]
6 2017-01 Canada Automotive parts, accessories and tire retaile...
7 2017-01 Canada Building material and garden equipment and sup...
8 2017-01 Canada Food and beverage retailers [445]
9 2017-01 Canada Grocery and convenience retailers [4451]
10 2017-01 Canada Supermarkets and other grocery retailers (exce...
11 2017-01 Canada Convenience retailers and vending machine oper...
12 2017-01 Canada Specialty food retailers [4452]
13 2017-01 Canada Beer, wine and liquor retailers [4453]
14 2017-01 Canada Furniture, home furnishings, electronics and a...
15 2017-01 Canada Furniture, floor covering, window treatment an...
16 2017-01 Canada Furniture retailers [44911]
17 2017-01 Canada Floor covering, window treatment and other hom...
18 2017-01 Canada Electronics and appliances retailers [4492]
19 2017-01 Canada General merchandise retailers [455]
20 2017-01 Canada Health and personal care retailers [456]
21 2017-01 Canada Gasoline stations and fuel vendors [457]
22 2017-01 Canada Clothing, clothing accessories, shoes, jewelry...
23 2017-01 Canada Clothing and clothing accessories retailers [4...
24 2017-01 Canada Shoe retailers [4582]
25 2017-01 Canada Jewellery, luggage and leather goods retailers...
26 2017-01 Canada Sporting goods, hobby, musical instrument, boo...
27 2017-01 Canada Sporting goods, hobby, musical instrument, boo...
28 2017-01 Canada Miscellaneous retailers [459B]
29 2018-10 Canada Cannabis retailers [459993]
Difference
0 [9040226.0, 126840.0, 182689.0, 45730.0, 26214...
1 [3399086.0, 12926819.0, 13323656.0, 13272052.0...
2 [2846376.0, 11355056.0, 11761008.0, 11613633.0...
3 [2716176.0, 10436986.0, 10615666.0, 10732703.0...
4 [130200.0, 918070.0, 928797.0, 956232.0, 71419...
5 [366675.0, 833194.0, 801177.0, 873755.0, 67821...
6 [186036.0, 738569.0, 761472.0, 782576.0, 79635...
7 [913853.0, 2883769.0, 2841831.0, 3017570.0, 25...
8 [1161041.0, 9997687.0, 10050652.0, 9877735.0, ...
9 [488145.0, 7524109.0, 7582046.0, 7400919.0, 75...
10 [402687.0, 6873402.0, 6932724.0, 6768383.0, 69...
11 [85458.0, 650708.0, 649322.0, 632536.0, 635986...
12 [126791.0, 474877.0, 521468.0, 464788.0, 56094...
13 [546105.0, 1698622.0, 1780965.0, 1346124.0, 17...
14 [422768.0, 3116812.0, 3135627.0, 3123924.0, 27...
15 [222806.0, 1651598.0, 1597010.0, 1609602.0, 13...
16 [122917.0, 1016750.0, 976866.0, 983560.0, 8114...
17 [99888.0, 621255.0, 622389.0, 565396.0, 578461...
18 [199962.0, 1461656.0, 1550633.0, 1509768.0, 15...
19 [1101249.0, 5650483.0, 5824153.0, 5860496.0, 5...
20 [178247.0, 3956670.0, 3967850.0, 3948422.0, 39...
21 [590050.0, 5334490.0, 5238522.0, 5266784.0, 51...
22 [828713.0, 2861514.0, 2768332.0, 2854327.0, 28...
23 [618563.0, 2227376.0, 2157844.0, 2192824.0, 21...
24 [110200.0, 351308.0, 327607.0, 361651.0, 29369...
25 [99952.0, 282830.0, 277727.0, 301379.0, 289002...
26 [445220.0, 2549047.0, 2549879.0, 2564029.0, 25...
27 [200015.0, 1137805.0, 1144872.0, 1160480.0, 92...
28 [245203.0, 1411243.0, 1374204.0, 1411103.0, 12...
29 [0.0, 53251.0, 57997.0, 51177.0, 49890.0, 5083...
C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset from a CSV file
df = df_cleaned
# Initialize an empty DataFrame to store the summary statistics
summary_df = pd.DataFrame(columns=["Category", "Mean Difference", "Min Difference", "Max Difference"])
# Get unique categories for which you want to compare values
categories = df["North American Industry Classification System (NAICS)"].unique()
# Loop through categories and compare "Unadjusted" and "Seasonally adjusted" values
for category in categories:
category_data = df[df["North American Industry Classification System (NAICS)"] == category]
unadjusted = category_data[category_data["Adjustments"] == "Unadjusted"]
seasonally_adjusted = category_data[category_data["Adjustments"] == "Seasonally adjusted"]
# Check if both "Unadjusted" and "Seasonally adjusted" data are available
if len(unadjusted) > 0 and len(seasonally_adjusted) > 0:
# Ensure the DataFrames have the same length before subtraction
min_length = min(len(unadjusted), len(seasonally_adjusted))
diff = seasonally_adjusted["VALUE"].values[:min_length] - unadjusted["VALUE"].values[:min_length]
# Calculate summary statistics
mean_diff = diff.mean()
min_diff = diff.min()
max_diff = diff.max()
summary_df = summary_df.append({
"Category": category,
"Mean Difference": mean_diff,
"Min Difference": min_diff,
"Max Difference": max_diff,
}, ignore_index=True)
# Create a bar plot to visualize mean differences
plt.figure(figsize=(12, 6))
plt.bar(summary_df["Category"], summary_df["Mean Difference"])
plt.xlabel("NAICS Category")
plt.ylabel("Mean Difference (Seasonally Adjusted - Unadjusted)")
plt.title("Mean Differences Between Seasonally Adjusted and Unadjusted Data")
plt.xticks(rotation=90)
plt.tight_layout()
# Show the plot
plt.show()
C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:42: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations.
import matplotlib.pyplot as plt
# Calculate total sales for each sector
sector_sales = df_cleaned.groupby('North American Industry Classification System (NAICS)')['VALUE'].sum()
# Calculate the total sales across all sectors
total_sales = sector_sales.sum()
# Calculate the percentages
sector_percentages = (sector_sales / total_sales) * 100
# Sort the sectors by percentages in descending order
sector_percentages = sector_percentages.sort_values(ascending=False)
# Define a color palette for the bars
colors = plt.cm.viridis(sector_percentages / max(sector_percentages))
# Create a bar graph to visualize sales percentages by industry
plt.figure(figsize=(10, 8))
bars = plt.barh(sector_percentages.index, sector_percentages.values, color=colors)
plt.xlabel('Percentage of Total Sales', fontsize=14)
plt.ylabel('Industry', fontsize=14)
plt.title('Sales Percentages by Industry', fontsize=16)
plt.gca().invert_yaxis()
# Add labels to the bars
for sector, percentage in zip(sector_percentages.index, sector_percentages.values):
plt.text(percentage + 1, sector, f'{percentage:.2f}%', va='center', fontsize=12)
plt.show()
import pandas as pd
import plotly.express as px
# Extract relevant columns
sales_data = df_cleaned[['REF_DATE', 'North American Industry Classification System (NAICS)', 'VALUE']]
# Convert REF_DATE to datetime
sales_data['REF_DATE'] = pd.to_datetime(sales_data['REF_DATE'])
# Group data by industry (North American Industry Classification System (NAICS)) and year
sales_by_industry = sales_data.groupby(['North American Industry Classification System (NAICS)', sales_data['REF_DATE'].dt.year])['VALUE'].sum().unstack().reset_index()
# Melt the data for use in Plotly Express
melted_data = pd.melt(sales_by_industry, id_vars='North American Industry Classification System (NAICS)', var_name='Year', value_name='Total Sales')
# Create an interactive line plot using Plotly Express
fig = px.line(
melted_data,
x='Year',
y='Total Sales',
color='North American Industry Classification System (NAICS)',
labels={'Total Sales': 'Total Sales (in thousand dollars)'},
title='Sales Trends by Industry'
)
fig.update_layout(
xaxis_title='Year',
yaxis_title='Total Sales (in thousand dollars)',
legend_title='Industry',
xaxis_tickangle=-45, # Rotate x-axis labels by 45 degrees
width=1000, # Increase the width of the plot
height=1000, # Increase the height of the plot
)
fig.show()
C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3411904461.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Group the data by the "North American Industry Classification System (NAICS)" column
industry_sales = df_cleaned.groupby("North American Industry Classification System (NAICS)")["VALUE"].sum()
# Convert the sales data to billions (divide by 1,000,000)
industry_sales_billion = industry_sales / 1_000_000
# Sort the industry sales data in descending order
industry_sales_billion = industry_sales_billion.sort_values(ascending=False)
# Display the sales data segmented by industry in billions
print(industry_sales_billion)
North American Industry Classification System (NAICS) Retail trade [44-45] 22299.302008 Motor vehicle and parts dealers [441] 4183.757364 Automobile dealers [4411] 3655.440550 New car dealers [44111] 3302.001521 Food and beverage retailers [445] 3152.090122 Grocery and convenience retailers [4451] 2372.693771 Supermarkets and other grocery retailers (except convenience retailers) [44511] 2183.357735 General merchandise retailers [455] 1928.070526 Gasoline stations and fuel vendors [457] 1577.571979 Health and personal care retailers [456] 1247.231226 Building material and garden equipment and supplies dealers [444] 961.486139 Furniture, home furnishings, electronics and appliances retailers [449] 959.451792 Sporting goods, hobby, musical instrument, book, and miscellaneous retailers [459] 886.844768 Clothing, clothing accessories, shoes, jewelry, luggage and leather goods retailers [458] 852.435220 Clothing and clothing accessories retailers [4581] 657.609805 Beer, wine and liquor retailers [4453] 550.403423 Miscellaneous retailers [459B] 542.210850 Furniture, floor covering, window treatment and other home furnishings retailers [4491] 502.519572 Electronics and appliances retailers [4492] 457.181169 Sporting goods, hobby, musical instrument, book retailers and news dealers [459A] 339.998045 Used car dealers [44112] 335.497963 Furniture retailers [44911] 318.112023 Other motor vehicle dealers [4412] 263.036074 Automotive parts, accessories and tire retailers [4413] 260.821328 Specialty food retailers [4452] 209.560161 Convenience retailers and vending machine operators [44513] 192.304048 Floor covering, window treatment and other home furnishing retailers [44912] 184.654946 Shoe retailers [4582] 95.100273 Jewellery, luggage and leather goods retailers [4583] 93.556810 Cannabis retailers [459993] 53.980520 Name: VALUE, dtype: float64